Run download_data.Rmd and percentage_of_regional_richness.Rmd First!
city_data
length(city_data$city_gdp_per_population[!is.na(city_data$city_gdp_per_population)])
[1] 30
length(city_data$percentage_urban_area_as_open_public_spaces[!is.na(city_data$percentage_urban_area_as_open_public_spaces)])
[1] 61
length(city_data$happiness_future_life[!is.na(city_data$happiness_future_life)])
[1] 65
length(city_data$mean_population_exposure_to_pm2_5_2019[!is.na(city_data$mean_population_exposure_to_pm2_5_2019)])
[1] 131
fetch_city_data_for <- function(pool_name, include_city_name = F, include_pool_size = T) {
results_filename <- paste(paste('percentage_of_regional_richness__output_', pool_name, 'city', 'richness', 'intercept', sep = "_"), "csv", sep = ".")
results <- read_csv(results_filename)
joined <- left_join(city_data, results)
pool_size_col_name <- paste(pool_name, 'pool', 'size', sep = "_")
required_columns <- c("population_growth", "rainfall_monthly_min", "rainfall_annual_average", "rainfall_monthly_max", "temperature_annual_average", "temperature_monthly_min", "temperature_monthly_max", "happiness_negative_effect", "happiness_positive_effect", "happiness_future_life", "number_of_biomes", "realm", "biome_name", "region_20km_includes_estuary", "region_50km_includes_estuary", "region_100km_includes_estuary", "city_includes_estuary", "region_20km_average_pop_density", "region_50km_average_pop_density", "region_100km_average_pop_density", "city_max_pop_density", "city_average_pop_density", "mean_population_exposure_to_pm2_5_2019", "region_20km_cultivated", "region_20km_urban", "region_50km_cultivated", "region_50km_urban", "region_100km_cultivated", "region_100km_urban", "region_20km_elevation_delta", "region_20km_mean_elevation", "region_50km_elevation_delta", "region_50km_mean_elevation", "region_100km_elevation_delta", "region_100km_mean_elevation", "city_elevation_delta", "city_mean_elevation", "urban", "shrubs", "permanent_water", "open_forest", "herbaceous_wetland", "herbaceous_vegetation", "cultivated", "closed_forest", "share_of_population_within_400m_of_open_space", "percentage_urban_area_as_streets", "percentage_urban_area_as_open_public_spaces_and_streets", "percentage_urban_area_as_open_public_spaces", "city_gdp_per_population", "city_ndvi", "city_ssm", "city_susm", "region_20km_ndvi", "region_20km_ssm", "region_20km_susm", "region_50km_ndvi", "region_50km_ssm", "region_50km_susm", "region_100km_ndvi", "region_100km_ssm", "region_100km_susm", "city_percentage_protected", "region_20km_percentage_protected", "region_50km_percentage_protected", "region_100km_percentage_protected")
if (include_pool_size) {
required_columns <- append(c(pool_size_col_name), required_columns)
}
if (include_city_name) {
required_columns <- append(c("name"), required_columns)
}
required_columns <- append(c("response"), required_columns)
joined[,required_columns]
}
merlin_city_data <- fetch_city_data_for('merlin')
── Column specification ─────────────────────────────────────────────────────────────────────────────────────
cols(
name = col_character(),
response = col_double()
)
Joining, by = "name"
merlin_city_data
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
library(reshape2)
library(rpart)
library(ggplot2)
Attaching package: ‘ggplot2’
The following object is masked from ‘package:randomForest’:
margin
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ tibble 3.1.2 ✓ dplyr 1.0.7
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.1
✓ purrr 0.3.4
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::combine() masks randomForest::combine()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x ggplot2::margin() masks randomForest::margin()
library(multcomp)
Loading required package: mvtnorm
Loading required package: survival
Loading required package: TH.data
Loading required package: MASS
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Attaching package: ‘TH.data’
The following object is masked from ‘package:MASS’:
geyser
library(car)
Loading required package: carData
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'car':
method from
influence.merMod lme4
cooks.distance.influence.merMod lme4
dfbeta.influence.merMod lme4
dfbetas.influence.merMod lme4
Attaching package: ‘car’
The following object is masked from ‘package:dplyr’:
recode
The following object is masked from ‘package:purrr’:
some
merlin_city_data_fixed <- rfImpute(response ~ ., merlin_city_data)
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 16.87 93.56 |
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 17.14 95.06 |
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 16.85 93.46 |
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 17.32 96.08 |
| Out-of-bag |
Tree | MSE %Var(y) |
300 | 16.91 93.80 |
merlin_city_data_fixed
source('./helper__random_forest_selection_functions.R')
scale_parameter_name <- function(scale, postscript) {
paste('region', paste(scale, 'km', sep = ''), postscript, sep = '_')
}
scale_parameters <- function(postscript) {
c(scale_parameter_name(20, postscript), scale_parameter_name(50, postscript), scale_parameter_name(100, postscript))
}
scales_parameters_without <- function(scale_to_exclude, postscript) {
scales <- scale_parameters(postscript)
scales[scales != scale_parameter_name(scale_to_exclude, postscript)]
}
select_scales <- function(urban, cultivated, elevation_delta, mean_elevation, average_pop_density, includes_estuary, ssm, susm, ndvi, percentage_protected) {
append(
append(
append(
append(
scales_parameters_without(scale_to_exclude = urban, postscript = 'urban'),
scales_parameters_without(scale_to_exclude = cultivated, postscript = 'cultivated')
),
append(
scales_parameters_without(scale_to_exclude = elevation_delta, postscript = 'elevation_delta'),
scales_parameters_without(scale_to_exclude = mean_elevation, postscript = 'mean_elevation')
)
),
append(
append(
scales_parameters_without(scale_to_exclude = average_pop_density, postscript = 'average_pop_density'),
scales_parameters_without(scale_to_exclude = includes_estuary, postscript = 'includes_estuary')
),
append(
scales_parameters_without(scale_to_exclude = ssm, postscript = 'ssm'),
scales_parameters_without(scale_to_exclude = susm, postscript = 'susm')
)
)
),
append(
scales_parameters_without(scale_to_exclude = ndvi, postscript = 'ndvi'),
scales_parameters_without(scale_to_exclude = percentage_protected, postscript = 'percentage_protected')
)
)
}
select_scales(urban = 20, cultivated = 100, elevation_delta = 20, mean_elevation = 100, average_pop_density = NA, includes_estuary = NA, ssm = 20, susm = 20, ndvi = 100, percentage_protected = NA)
[1] "region_50km_urban" "region_100km_urban"
[3] "region_20km_cultivated" "region_50km_cultivated"
[5] "region_50km_elevation_delta" "region_100km_elevation_delta"
[7] "region_20km_mean_elevation" "region_50km_mean_elevation"
[9] "region_20km_average_pop_density" "region_50km_average_pop_density"
[11] "region_100km_average_pop_density" "region_20km_includes_estuary"
[13] "region_50km_includes_estuary" "region_100km_includes_estuary"
[15] "region_50km_ssm" "region_100km_ssm"
[17] "region_50km_susm" "region_100km_susm"
[19] "region_20km_ndvi" "region_50km_ndvi"
[21] "region_20km_percentage_protected" "region_50km_percentage_protected"
[23] "region_100km_percentage_protected"
select_scales(urban = , cultivated = , elevation_delta = , mean_elevation = , average_pop_density = , includes_estuary = , ssm = , susm = , ndvi =, percentage_protected = )
select_variables_from_random_forest(merlin_city_data_fixed)
[1] "merlin_pool_size"
[2] "biome_name"
[3] "realm"
[4] "region_100km_ssm"
[5] "region_50km_elevation_delta"
[6] "region_20km_elevation_delta"
[7] "temperature_monthly_min"
[8] "region_50km_ssm"
[9] "city_gdp_per_population"
[10] "temperature_annual_average"
[11] "region_50km_susm"
[12] "region_20km_urban"
[13] "rainfall_monthly_min"
[14] "permanent_water"
[15] "region_100km_elevation_delta"
[16] "region_50km_urban"
[17] "region_20km_cultivated"
[18] "region_100km_cultivated"
[19] "shrubs"
[20] "happiness_positive_effect"
[21] "region_20km_ndvi"
[22] "share_of_population_within_400m_of_open_space"
[23] "city_percentage_protected"
[24] "herbaceous_wetland"
[25] "region_100km_urban"
[26] "region_50km_cultivated"
[27] "region_20km_average_pop_density"
[28] "region_50km_average_pop_density"
[29] "temperature_monthly_max"
[30] "city_max_pop_density"
[31] "region_100km_average_pop_density"
[32] "city_mean_elevation"
[33] "city_ndvi"
[34] "city_average_pop_density"
[35] "rainfall_monthly_max"
[36] "happiness_future_life"
[37] "region_20km_susm"
[38] "region_50km_percentage_protected"
[39] "mean_population_exposure_to_pm2_5_2019"
[40] "city_susm"
[41] "region_100km_susm"
[42] "city_elevation_delta"
[43] "region_50km_ndvi"
[44] "region_100km_percentage_protected"
[45] "region_20km_ssm"
[46] "region_20km_percentage_protected"
[47] "rainfall_annual_average"
[48] "region_20km_mean_elevation"
[49] "percentage_urban_area_as_open_public_spaces_and_streets"
[50] "urban"
[51] "region_100km_ndvi"
[52] "city_ssm"
[53] "cultivated"
[54] "region_50km_mean_elevation"
[55] "region_100km_mean_elevation"
[56] "population_growth"
[57] "happiness_negative_effect"
[58] "percentage_urban_area_as_streets"
[59] "closed_forest"
[60] "open_forest"
[61] "percentage_urban_area_as_open_public_spaces"
exclude_merlin <- !names(merlin_city_data_fixed) %in% select_scales(urban = 20, cultivated = 100, elevation_delta = 50, mean_elevation = 20, average_pop_density = 50, includes_estuary = NA, ssm = 100, susm = 50, ndvi = 20, percentage_protected = 50)
merlin_city_data_fixed_single_scale <- merlin_city_data_fixed[,exclude_merlin]
merlin_city_data_fixed_single_scale
select_variables_from_random_forest(merlin_city_data_fixed_single_scale)
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi", "temperature_monthly_max")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi", "temperature_monthly_max", "city_average_pop_density")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi", "temperature_monthly_max", "city_average_pop_density", "region_50km_average_pop_density")])
create_fifty_rows_of_oob(merlin_city_data_fixed[,c("response", "merlin_pool_size", "biome_name", "realm", "region_100km_ssm", "temperature_annual_average", "temperature_monthly_min", "region_50km_elevation_delta", "rainfall_monthly_min", "permanent_water", "region_50km_susm", "region_20km_ndvi", "region_20km_urban", "shrubs", "city_gdp_per_population", "happiness_positive_effect", "city_percentage_protected", "region_100km_cultivated", "share_of_population_within_400m_of_open_space", "rainfall_monthly_max", "city_ndvi", "temperature_monthly_max", "city_average_pop_density", "region_50km_average_pop_density", "rainfall_annual_average")])
“merlin_pool_size”, “biome_name”, “realm”
birdlife_city_data <- fetch_city_data_for('birdlife')
birdlife_city_data
ggplot(birdlife_city_data, aes(response)) + geom_histogram(binwidth = 1)
birdlife_city_data_fixed <- rfImpute(response ~ ., birdlife_city_data)
birdlife_city_data_fixed
select_variables_from_random_forest(birdlife_city_data_fixed)
exclude_birdlife <- !names(birdlife_city_data_fixed) %in% select_scales(urban = 100, cultivated = 100, elevation_delta = 20, mean_elevation = 100, average_pop_density = 20, includes_estuary = NA, ssm = 50, susm = 100, ndvi = 100, percentage_protected = 100)
birdlife_city_data_fixed_single_scale <- birdlife_city_data_fixed[,exclude_birdlife]
birdlife_city_data_fixed_single_scale
select_variables_from_random_forest(birdlife_city_data_fixed_single_scale)
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets", "rainfall_annual_average")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets", "rainfall_annual_average", "city_susm")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets", "rainfall_annual_average", "city_susm", "region_100km_ndvi")])
create_fifty_rows_of_oob(birdlife_city_data_fixed[,c("response", "population_growth", "region_50km_ssm", "birdlife_pool_size", "biome_name", "region_100km_cultivated", "city_ndvi", "temperature_monthly_min", "percentage_urban_area_as_open_public_spaces", "rainfall_monthly_min", "region_100km_susm", "region_20km_average_pop_density", "city_ssm", "permanent_water", "rainfall_monthly_max", "region_100km_urban", "temperature_annual_average", "percentage_urban_area_as_open_public_spaces_and_streets", "region_20km_elevation_delta", "share_of_population_within_400m_of_open_space", "shrubs", "mean_population_exposure_to_pm2_5_2019", "city_average_pop_density", "percentage_urban_area_as_streets", "rainfall_annual_average", "city_susm", "region_100km_ndvi", "happiness_future_life")])
“population_growth”, “region_50km_ssm”, “birdlife_pool_size”
| So…. |
|---|
| Merlin: “merlin_pool_size”, “biome_name”, “realm” Birdlife: “population_growth”, “region_50km_ssm”, “birdlife_pool_size” |
library(boot)
Attaching package: ‘boot’
The following object is masked from ‘package:car’:
logit
The following object is masked from ‘package:survival’:
aml
merlin_city_data_named <- fetch_city_data_for('merlin', T)
── Column specification ─────────────────────────────────────────────────────────────────────────────────────
cols(
name = col_character(),
response = col_double()
)
Joining, by = "name"
birdlife_city_data_named <- fetch_city_data_for('birdlife', T)
── Column specification ─────────────────────────────────────────────────────────────────────────────────────
cols(
name = col_character(),
response = col_double()
)
Joining, by = "name"
| Use cross validation and dropping terms to find best model |
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 14.51828
– Can we drop one?
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.8059
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.46984
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + population_growth + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 14.39671
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 14.30384
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 20.17595
– drop populaion_growth (14.30384)
– can we drop another?
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + region_50km_ssm + biome_name, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.56925
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + region_50km_ssm + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.24412
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 14.20889
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ region_50km_ssm + biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 20.17595
– drop ssm (14.20889)
– can we drop another?
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + biome_name, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.38282
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.10131
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ biome_name + realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 20.56763
– drop biome (13.10131)
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.29241
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 19.48513
cv.glm(merlin_city_data_fixed_no_boreal, glm(formula = response ~ merlin_pool_size * realm, data = merlin_city_data_fixed_no_boreal))$delta[1]
[1] 13.85502
| – best model with merlin is pool size + realm (CV error 13.10131) |
merlin_city_data_fixed$realm <- factor(merlin_city_data_fixed$realm, levels = c("Palearctic", "Nearctic", "Australasia", "Indomalayan", "Neotropic", "Afrotropic"))
summary(glm(data = merlin_city_data_fixed, formula = response ~ merlin_pool_size + realm))
Call:
glm(formula = response ~ merlin_pool_size + realm, data = merlin_city_data_fixed)
Deviance Residuals:
Min 1Q Median 3Q Max
-8.5046 -1.6225 -0.5191 1.4001 16.3762
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.155064 0.927938 6.633 8.03e-10 ***
merlin_pool_size -0.028445 0.003408 -8.346 9.02e-14 ***
realmNearctic 3.089566 1.035703 2.983 0.00341 **
realmAustralasia -0.304872 2.107524 -0.145 0.88520
realmIndomalayan 1.888475 0.823167 2.294 0.02339 *
realmNeotropic 3.355053 0.997078 3.365 0.00101 **
realmAfrotropic 1.602668 1.215952 1.318 0.18981
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 12.36947)
Null deviance: 2469.6 on 136 degrees of freedom
Residual deviance: 1608.0 on 130 degrees of freedom
AIC: 742.19
Number of Fisher Scoring iterations: 2
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.817509
– can we drop a variable?
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 6.1237
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.492536
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + biome_name + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.764969
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + region_50km_ssm + biome_name + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.666362
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ population_growth + region_50km_ssm + biome_name + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 7.061169
– drop biome (5.492536)
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.686036
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + population_growth + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.499406
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + region_50km_ssm + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.38609
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ population_growth + region_50km_ssm + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 6.690224
– drop population growth (5.38609)
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + region_50km_ssm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.577701
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.38033
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ region_50km_ssm + realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 6.637624
– drop ssm (5.38033)
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 5.613765
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
[1] 6.746644
cv.glm(birdlife_city_data_fixed_no_boreal, glm(formula = response ~ birdlife_pool_size * realm, data = birdlife_city_data_fixed_no_boreal))$delta[1]
| – so best model with birdlife is pool size + realm |
summary(glm(data = birdlife_city_data_fixed, formula = response ~ birdlife_pool_size + realm))
Call:
glm(formula = response ~ birdlife_pool_size + realm, data = birdlife_city_data_fixed)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.9219 -1.2809 -0.3423 0.8298 9.5036
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.16482 0.64778 4.886 2.98e-06 ***
birdlife_pool_size -0.01493 0.00252 -5.926 2.62e-08 ***
realmNearctic 0.58805 0.63743 0.923 0.35796
realmAustralasia 1.04651 1.38301 0.757 0.45060
realmIndomalayan 1.87907 0.60496 3.106 0.00233 **
realmNeotropic 2.26051 0.70591 3.202 0.00171 **
realmAfrotropic 2.54804 0.89206 2.856 0.00499 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 5.241433)
Null deviance: 865.45 on 136 degrees of freedom
Residual deviance: 681.39 on 130 degrees of freedom
AIC: 624.56
Number of Fisher Scoring iterations: 2
summary(glm(formula = response ~ merlin_pool_size + realm, data = merlin_city_data_fixed_no_boreal))
Call:
glm(formula = response ~ merlin_pool_size + realm, data = merlin_city_data_fixed_no_boreal)
Deviance Residuals:
Min 1Q Median 3Q Max
-8.5389 -1.7132 -0.5164 1.3898 16.2335
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.823312 1.406157 5.564 1.46e-07 ***
merlin_pool_size -0.028685 0.003401 -8.434 5.79e-14 ***
realmAustralasia -1.923665 2.294340 -0.838 0.403
realmIndomalayan 0.288224 1.193798 0.241 0.810
realmNearctic 1.499468 1.327678 1.129 0.261
realmNeotropic 1.767199 1.293308 1.366 0.174
realmPalearctic -1.490884 1.214536 -1.228 0.222
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 12.28537)
Null deviance: 2458.9 on 135 degrees of freedom
Residual deviance: 1584.8 on 129 degrees of freedom
AIC: 735.91
Number of Fisher Scoring iterations: 2
ggplot(merlin_city_data_fixed_no_boreal, aes(x = merlin_pool_size, y = response)) + geom_point() + geom_smooth(method = "glm", se = F) + facet_wrap(~ realm)
`geom_smooth()` using formula 'y ~ x'
ggplot(birdlife_city_data_fixed_no_boreal, aes(x = birdlife_pool_size, y = response)) + geom_point() + geom_smooth(method = "glm", se = F) + facet_wrap(~ realm)
`geom_smooth()` using formula 'y ~ x'
birdlife.fit <- glm(data = birdlife_city_data_fixed_no_boreal, formula = response ~ birdlife_pool_size + realm)
summary(birdlife.fit)
Call:
glm(formula = response ~ birdlife_pool_size + realm, data = birdlife_city_data_fixed_no_boreal)
Deviance Residuals:
Min 1Q Median 3Q Max
-5.9240 -1.2855 -0.3158 0.8601 9.4227
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.724170 1.182215 4.842 3.61e-06 ***
birdlife_pool_size -0.014963 0.002513 -5.955 2.31e-08 ***
realmAustralasia -1.504499 1.508076 -0.998 0.32033
realmIndomalayan -0.670291 0.785239 -0.854 0.39490
realmNearctic -1.963875 0.918121 -2.139 0.03432 *
realmNeotropic -0.288102 0.832533 -0.346 0.72987
realmPalearctic -2.474092 0.891147 -2.776 0.00632 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 5.210381)
Null deviance: 857.07 on 135 degrees of freedom
Residual deviance: 672.14 on 129 degrees of freedom
AIC: 619.25
Number of Fisher Scoring iterations: 2
with(summary(birdlife.fit), 1 - deviance/null.deviance)
[1] 0.2157718
plot(birdlife.fit)
birdlife_city_data_fixed_no_boreal[c(16, 53, 116, 123), c("name", "birdlife_pool_size")]
ggplot(birdlife_city_data_fixed_no_boreal, aes(x = birdlife_pool_size, y = response)) +
geom_point(size=1) +
geom_smooth(method = "glm", se = F) +
geom_text(aes(label = name), data = birdlife_city_data_fixed_no_boreal[c(16, 53, 123),], size = 3, position = "dodge", vjust = "inward", hjust = "inward", color = "red", angle=-15) +
geom_point(data = birdlife_city_data_fixed_no_boreal[c(16, 53, 123),], color = "red") +
facet_wrap(~ realm) +
theme_bw() +
ylab("City Random Effect Intercept") + xlab("Regional Pool Size") + labs(title = "Birdlife")
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
ggsave("city_effect_richness__output__birdlife.jpg")
Saving 7.29 x 4.51 in image
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
merlin.fit <- glm(data = merlin_city_data_fixed_no_boreal, formula = response ~ merlin_pool_size + realm)
summary(merlin.fit)
Call:
glm(formula = response ~ merlin_pool_size + realm, data = merlin_city_data_fixed_no_boreal)
Deviance Residuals:
Min 1Q Median 3Q Max
-8.5389 -1.7132 -0.5164 1.3898 16.2335
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.823312 1.406157 5.564 1.46e-07 ***
merlin_pool_size -0.028685 0.003401 -8.434 5.79e-14 ***
realmAustralasia -1.923665 2.294340 -0.838 0.403
realmIndomalayan 0.288224 1.193798 0.241 0.810
realmNearctic 1.499468 1.327678 1.129 0.261
realmNeotropic 1.767199 1.293308 1.366 0.174
realmPalearctic -1.490884 1.214536 -1.228 0.222
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 12.28537)
Null deviance: 2458.9 on 135 degrees of freedom
Residual deviance: 1584.8 on 129 degrees of freedom
AIC: 735.91
Number of Fisher Scoring iterations: 2
with(summary(merlin.fit), 1 - deviance/null.deviance)
[1] 0.3554797
plot(merlin.fit)
city_data[c(7, 24, 77), c("name", "birdlife_pool_size", "merlin_pool_size")]
ggplot(merlin_city_data_fixed_no_boreal, aes(x = merlin_pool_size, y = response)) +
geom_point(size = 1) +
geom_smooth(method = "glm", se = F) +
geom_text(aes(label = name), data = merlin_city_data_fixed_no_boreal[c(7, 24, 77),], size = 3, position = "dodge", vjust = "inward", hjust = "inward", color = "red", angle=-15) +
geom_point(data = merlin_city_data_fixed_no_boreal[c(7, 24, 77),], color = "red") +
facet_wrap(~ realm) +
theme_bw() +
ylab("City Random Effect Intercept") + xlab("Regional Pool Size") + labs(title = "eBird")
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
ggsave("city_effect_richness__output__merlin.jpg")
Saving 7.29 x 4.51 in image
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
merlin_city_data_fixed_no_boreal$residuals <- resid(merlin.fit)
ggplot(merlin_city_data_fixed_no_boreal, aes(y = response, x = residuals)) +
geom_smooth(method = "lm", se = F) +
geom_point(aes(color = realm)) +
geom_text(aes(label = name), data = merlin_city_data_fixed_no_boreal[c(7, 24, 77),], size = 4, position = "dodge", vjust = "inward", hjust = "inward") +
labs(title = "Merlin", subtitle = paste("Correlation", cor(merlin_city_data_fixed_no_boreal$residuals, merlin_city_data_fixed_no_boreal$response))) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
birdlife_city_data_fixed_no_boreal$residuals <- resid(birdlife.fit)
ggplot(birdlife_city_data_fixed_no_boreal, aes(y = response, x = residuals)) +
geom_smooth(method = "lm", se = F, alpha = 0.5) +
geom_point(aes(color = realm)) +
geom_text(aes(label = name), data = birdlife_city_data_fixed_no_boreal[c(16, 53, 124),], size = 4, position = "dodge", vjust = "inward", hjust = "inward") +
labs(title = "Birdlife", subtitle = paste("Correlation", cor(birdlife_city_data_fixed_no_boreal$residuals, birdlife_city_data_fixed_no_boreal$response))) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`
birdlife_city_data_fixed_no_boreal$residuals_of_fit <- resid(lm(response ~ residuals, birdlife_city_data_fixed_no_boreal))
ggplot(birdlife_city_data_fixed_no_boreal, aes(x = realm, y = residuals_of_fit)) + geom_boxplot()
birdlife.aov <- aov(birdlife_city_data_fixed_no_boreal$residuals_of_fit ~ birdlife_city_data_fixed_no_boreal$realm)
summary(birdlife.aov)
Df Sum Sq Mean Sq F value Pr(>F)
birdlife_city_data_fixed_no_boreal$realm 5 0.16 0.0315 0.022 1
Residuals 130 184.77 1.4213
merlin_city_data_fixed_no_boreal$residuals_of_fit <- resid(lm(response ~ residuals, merlin_city_data_fixed_no_boreal))
ggplot(merlin_city_data_fixed_no_boreal, aes(x = realm, y = residuals_of_fit)) + geom_boxplot()
merlin.aov <- aov(merlin_city_data_fixed_no_boreal$residuals_of_fit ~ merlin_city_data_fixed_no_boreal$realm)
summary(merlin.aov)
Df Sum Sq Mean Sq F value Pr(>F)
merlin_city_data_fixed_no_boreal$realm 5 0.2 0.040 0.006 1
Residuals 130 873.9 6.722
ggplot(merlin_city_data_fixed_no_boreal, aes(residuals)) + geom_histogram(binwidth = 1)
ggplot(birdlife_city_data_fixed_no_boreal, aes(residuals)) + geom_histogram(binwidth = 1)
shapiro.test(birdlife_city_data_fixed_no_boreal$response)
Shapiro-Wilk normality test
data: birdlife_city_data_fixed_no_boreal$response
W = 0.93733, p-value = 8.842e-06
shapiro.test(birdlife_city_data_fixed_no_boreal$residuals)
Shapiro-Wilk normality test
data: birdlife_city_data_fixed_no_boreal$residuals
W = 0.94253, p-value = 2.095e-05
shapiro.test(birdlife_city_data_fixed_no_boreal$residuals_of_fit)
Shapiro-Wilk normality test
data: birdlife_city_data_fixed_no_boreal$residuals_of_fit
W = 0.90395, p-value = 7.391e-08
shapiro.test(merlin_city_data_fixed_no_boreal$response)
Shapiro-Wilk normality test
data: merlin_city_data_fixed_no_boreal$response
W = 0.95308, p-value = 0.0001366
shapiro.test(merlin_city_data_fixed_no_boreal$residuals)
Shapiro-Wilk normality test
data: merlin_city_data_fixed_no_boreal$residuals
W = 0.93318, p-value = 4.568e-06
shapiro.test(merlin_city_data_fixed_no_boreal$residuals_of_fit)
Shapiro-Wilk normality test
data: merlin_city_data_fixed_no_boreal$residuals_of_fit
W = 0.94707, p-value = 4.596e-05
Variances of groups are NOT significantly different:
bartlett.test(merlin_city_data_fixed_no_boreal$residuals, merlin_city_data_fixed_no_boreal$realm)
Bartlett test of homogeneity of variances
data: merlin_city_data_fixed_no_boreal$residuals and merlin_city_data_fixed_no_boreal$realm
Bartlett's K-squared = 12.245, df = 5, p-value = 0.03159
leveneTest(merlin_city_data_fixed_no_boreal$residuals, merlin_city_data_fixed_no_boreal$realm)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 1.22 0.3033
130
Variances of groups are NOT significantly different:
bartlett.test(birdlife_city_data_fixed_no_boreal$residuals, birdlife_city_data_fixed_no_boreal$realm)
Bartlett test of homogeneity of variances
data: birdlife_city_data_fixed_no_boreal$residuals and birdlife_city_data_fixed_no_boreal$realm
Bartlett's K-squared = 16.504, df = 5, p-value = 0.005544
leveneTest(birdlife_city_data_fixed_no_boreal$residuals, birdlife_city_data_fixed_no_boreal$realm)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 1.4422 0.2134
130
Variances of groups are significantly different:
bartlett.test(merlin_city_data_fixed_no_boreal$residuals_of_fit, merlin_city_data_fixed_no_boreal$realm)
Bartlett test of homogeneity of variances
data: merlin_city_data_fixed_no_boreal$residuals_of_fit and merlin_city_data_fixed_no_boreal$realm
Bartlett's K-squared = 63.781, df = 5, p-value = 2.006e-12
leveneTest(merlin_city_data_fixed_no_boreal$residuals_of_fit, merlin_city_data_fixed_no_boreal$realm)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 7.1996 5.587e-06 ***
130
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Variances of groups are significantly different:
bartlett.test(birdlife_city_data_fixed_no_boreal$residuals_of_fit, birdlife_city_data_fixed_no_boreal$realm)
Bartlett test of homogeneity of variances
data: birdlife_city_data_fixed_no_boreal$residuals_of_fit and birdlife_city_data_fixed_no_boreal$realm
Bartlett's K-squared = 77.857, df = 5, p-value = 2.355e-15
leveneTest(birdlife_city_data_fixed_no_boreal$residuals_of_fit, birdlife_city_data_fixed_no_boreal$realm)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 5 9.9859 4.199e-08 ***
130
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
| Check AIC |
AIC(
glm(data = merlin_city_data_fixed, formula = response ~ merlin_pool_size + population_growth + region_50km_ssm + biome_name + realm),
glm(data = merlin_city_data_fixed, formula = response ~ merlin_pool_size + realm)
)
AIC(
glm(data = birdlife_city_data_fixed, formula = response ~ birdlife_pool_size + population_growth + region_50km_ssm + biome_name + realm),
glm(data = birdlife_city_data_fixed, formula = response ~ birdlife_pool_size + realm)
)
| Compare outliers between two pool sizes |
ggplot(city_data, aes(x = merlin_pool_size, y = birdlife_pool_size)) +
geom_point() +
geom_smooth(se = F, method = "lm") +
geom_text(aes(label = name), data = city_data[c(16, 53, 124, 7, 24, 77),], size = 3, position = "dodge", vjust = "inward", hjust = "inward", color = "red", angle=-15) +
geom_point(data = city_data[c(16, 53, 124, 7, 24, 77),], color = "red") +
facet_wrap (~ realm) +
xlab("eBird Pool Size") + ylab("Birdlife Pool Size")
`geom_smooth()` using formula 'y ~ x'
Warning: Width not defined. Set with `position_dodge(width = ?)`